In this article, we will build hedonic pricing models to examine the effect of different factors such as proxmity to certain facilities and number of facilities within a certain radius. The hedonic pricing models will be built using Geographical Weighted Regression.
packages = c('olsrr', 'corrplot', 'ggpubr', 'sf', 'spdep', 'GWmodel', 'tmap', 'tidyverse', 'httr', 'jsonlite', 'matrixStats', 'raster', 'geosphere', 'units')
for (p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p,character.only = T)
}
The following datasets will be used for our analysis:
MP14_SUBZONE_WEB_PL is the URA Master Plan planning subzone boundaries.
mpsz <- st_read(dsn = "data/geospatial", layer = "MP14_SUBZONE_WEB_PL") %>%
st_transform(3414)
Reading layer `MP14_SUBZONE_WEB_PL' from data source
`C:\darrylkwok\IS415_blog\_posts\2021-10-22-take-home-exercise-3\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 323 features and 15 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
Projected CRS: SVY21
child-care-services-geojson is the geospatial information of the Childcare Services in Singapore.
childcare_sf <- st_read("data/geospatial/child-care-services-geojson.geojson") %>%
st_transform(crs = 3414)
Reading layer `child-care-services-geojson' from data source `C:\darrylkwok\IS415_blog\_posts\2021-10-22-take-home-exercise-3\data\geospatial\child-care-services-geojson.geojson'
using driver `GeoJSON'
Simple feature collection with 1545 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6824 ymin: 1.248403 xmax: 103.9897 ymax: 1.462134
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
ELDERCARE is the geospatial information of the Elderly Care Services in Singapore.
eldercare_sf <- st_read(dsn="data/geospatial", layer = "ELDERCARE") %>%
st_transform(3414)
Reading layer `ELDERCARE' from data source
`C:\darrylkwok\IS415_blog\_posts\2021-10-22-take-home-exercise-3\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 133 features and 18 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 14481.92 ymin: 28218.43 xmax: 41665.14 ymax: 46804.9
Projected CRS: SVY21
MRTLRTStnPtt is the geospatial information of the MRT and LRT Stations in Singapore.
mrtlrt_sf <- st_read(dsn="data/geospatial", layer = "MRTLRTStnPtt") %>%
st_transform(3414)
Reading layer `MRTLRTStnPtt' from data source
`C:\darrylkwok\IS415_blog\_posts\2021-10-22-take-home-exercise-3\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 171 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 6138.311 ymin: 27555.06 xmax: 45254.86 ymax: 47854.2
Projected CRS: SVY21
parks-geojson is the geospatial information of the Parks in Singapore.
parks_sf <- st_read("data/geospatial/parks-geojson.geojson") %>%
st_transform(3414)
Reading layer `parks-geojson' from data source
`C:\darrylkwok\IS415_blog\_posts\2021-10-22-take-home-exercise-3\data\geospatial\parks-geojson.geojson'
using driver `GeoJSON'
Simple feature collection with 350 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6929 ymin: 1.214058 xmax: 104.0017 ymax: 1.461503
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
kindergartens-geojson is the geospatial information of the Kindergartens in Singapore.
kinder_sf <- st_read("data/geospatial/kindergartens-geojson.geojson") %>%
st_transform(3414)
Reading layer `kindergartens-geojson' from data source
`C:\darrylkwok\IS415_blog\_posts\2021-10-22-take-home-exercise-3\data\geospatial\kindergartens-geojson.geojson'
using driver `GeoJSON'
Simple feature collection with 448 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6887 ymin: 1.247759 xmax: 103.9717 ymax: 1.455452
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
hawker-centres-geojson is the geospatial information of the Hawker Centres in Singapore.
hawkers_sf <- st_read("data/geospatial/hawker-centres-geojson.geojson") %>%
st_transform(3414)
Reading layer `hawker-centres-geojson' from data source
`C:\darrylkwok\IS415_blog\_posts\2021-10-22-take-home-exercise-3\data\geospatial\hawker-centres-geojson.geojson'
using driver `GeoJSON'
Simple feature collection with 125 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6974 ymin: 1.272716 xmax: 103.9882 ymax: 1.449217
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
supermarkets-geojson is the geospatial information of the Supermarkets in Singapore.
supermkts_sf <- st_read("data/geospatial/supermarkets-geojson.geojson") %>%
st_transform(3414)
Reading layer `supermarkets-geojson' from data source
`C:\darrylkwok\IS415_blog\_posts\2021-10-22-take-home-exercise-3\data\geospatial\supermarkets-geojson.geojson'
using driver `GeoJSON'
Simple feature collection with 526 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6258 ymin: 1.24715 xmax: 104.0036 ymax: 1.461526
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
busStop is the geospatial information of the BusStops in Singapore.
busstops_sf <- st_read(dsn="data/geospatial", layer = "BusStop")%>%
st_transform(3414)
Reading layer `BusStop' from data source
`C:\darrylkwok\IS415_blog\_posts\2021-10-22-take-home-exercise-3\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 5156 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 4427.938 ymin: 26482.1 xmax: 48282.5 ymax: 52983.82
Projected CRS: SVY21
The actions taken will be:
geo_preprocess <- function(df, add_col) {
df <- df[!duplicated(df[, add_col]), ] %>%
st_make_valid()
return(df)
}
The Geospatial Pre-processing function will be executed on all the Geospatial Data imported.
mpsz <- geo_preprocess(mpsz, "geometry")
childcare_sf <- geo_preprocess(childcare_sf, "geometry")
eldercare_sf <- geo_preprocess(eldercare_sf, "ADDRESSPOS")
mrtlrt_sf <- geo_preprocess(mrtlrt_sf, "STN_NAME")
parks_sf <- geo_preprocess(parks_sf, "geometry")
kinder_sf <- geo_preprocess(kinder_sf, "geometry")
hawkers_sf <- geo_preprocess(hawkers_sf, "geometry")
supermkts_sf <- geo_preprocess(supermkts_sf, "geometry")
busstops_sf <- geo_preprocess(busstops_sf, "geometry")
We will make use of the read_csv function to read the resale flat prices that is obtained from data.gov.sg
resale_prices <- read_csv("data/aspatial/resale-flat-prices-jan-2019-to-sept-2020.csv")
Since there are occurrences of “ST.” in the CSV file, we will replace the “ST.” occurrences to “SAINT”
resale_prices$street_name <- gsub("ST\\.", "SAINT", resale_prices$street_name)
We can see that the data read in from the CSV file, does not contain any coordinates. Therefore we will have to perform geocoding.
This function calls the search function of the commonapi of OneMap.
Since the search function of the API does not require a token, we will not use a token here.
The steps taken are:
geocode_addr <- function(block, street_name) {
url <- "https://developers.onemap.sg/commonapi/search"
search_addr <- paste(block, street_name, sep = " ")
query <- list("searchVal" = search_addr, "returnGeom" = "Y", "getAddrDetails" = "N", "pageNum" = "1")
res <- GET(url, query = query)
jsonRespText<-content(res,as="text")
output <- fromJSON(jsonRespText) %>%
as.data.frame %>%
select(results.LATITUDE, results.LONGITUDE)
return(output)
}
Execute the Geocoding function defined above to all the rows within the dataset imported from resale-flat-prices-jan-2019-to-sept-2020.csv
resale_prices$LATITUDE <- 0
resale_prices$LONGITUDE <- 0
for (i in 1:nrow(resale_prices)) {
temp_output <- geocode_addr(resale_prices[i, 4], resale_prices[i, 5])
resale_prices$LATITUDE[i] <- temp_output$results.LATITUDE
resale_prices$LONGITUDE[i] <- temp_output$results.LONGITUDE
}
This function converts the actual remaining lease period from a string format to a double format
Steps Taken: - Split the string within the remaining_lease column. - If the string length is 4, it contains months. Else it contains years only. - Output the final value as a double.
cal_remain_lease <- function(each_col) {
entry_list <- unlist(strsplit(each_col, '\\s+')[[1]])
if (length(entry_list) > 2) {
years <- as.numeric(unlist(entry_list[1]))
months <-as.numeric(unlist(entry_list[3]))
total_dur <- years + round((months/12), 2)
} else {
years <- as.numeric(unlist(entry_list[1]))
total_dur <- years
}
return(unlist(total_dur))
}
Calculate the remaining lease duration for each record by executing the function defined above.
resale_prices$remaining_lease <- sapply(resale_prices$remaining_lease, cal_remain_lease)
The Downtown Core is one of the central districts of Singapore. Located in the Marina Bay Area, in the SouthWest part of the country.
After creating the dataframe, convert the dataframe into a sf object
lat <- 1.287953
long <- 103.851784
cbd_coor_sf <- data.frame(lat, long) %>%
st_as_sf(., coords = c("long", "lat"), crs=4326) %>%
st_transform(crs=3414)
This function is to find the distance to the nearest facility
Steps Taken: - Make use of st_distance method to calculate the distances between all the HDBs and the facility in question. - A matrix will be generated from the function st_function - rowMins is used to get the shortest distance for each row within the matrix.
prox_prep <- function(prim_df, sec_df, var_name) {
dist_matrix <- st_distance(prim_df, sec_df) %>%
drop_units()
prim_df[,var_name] <- rowMins(dist_matrix)
return(prim_df)
}
This function is to count the number of facility within a radius
Steps Taken: - Make use of st_distance method to calculate the distances between all the HDBs and the facility in question. - Convert the matrix generated into a dataframe. - Filter out and count the number of facilities that are within a certain radius.
num_prep <- function(prim_df, sec_df, radius, var_name) {
dist_matrix <- st_distance(prim_df, sec_df) %>%
drop_units()
dist_matrix <- data.frame(dist_matrix)
prim_df[,var_name] <- rowSums(dist_matrix <= radius)
return(prim_df)
}
Create Facility Proximity columns
resale_prices_sf <- prox_prep(resale_prices_sf, eldercare_sf, "PROX_ELDER") %>%
prox_prep(., mrtlrt_sf, "PROX_MRTLRT") %>%
prox_prep(., parks_sf, "PROX_PARK") %>%
prox_prep(., hawkers_sf, "PROX_HAWKER") %>%
prox_prep(., cbd_coor_sf, "PROX_CBD") %>%
prox_prep(., supermkts_sf, "PROX_SPRMKTS")
Create number of Facilities columns
resale_prices_sf <- num_prep(resale_prices_sf, childcare_sf, 350, "NUM_CC") %>%
num_prep(., kinder_sf, 350, "NUM_KINDER") %>%
num_prep(., busstops_sf, 350, "NUM_BUSSTOPS")
In order to save time having to rerun the pre-processing functions again, we will write the dataframe to a csv and read in when we need it.
Note: This is done after all the preprocessing and creation of new columns
After writing the sf object to a shapefile, the column names will be truncated.
st_write(resale_prices_sf, "data/final_resale_info.shp")
Read in the shapefile that contains all the information created earlier
resale_prices_sf <- st_read(dsn="data", layer="final_resale_info")
Reading layer `final_resale_info' from data source
`C:\darrylkwok\IS415_blog\_posts\2021-10-22-take-home-exercise-3\data'
using driver `ESRI Shapefile'
Simple feature collection with 15901 features and 20 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 11597.31 ymin: 28217.39 xmax: 42623.63 ymax: 48741.06
Projected CRS: SVY21 / Singapore TM
Visualise the different columns of the resale_prices_sf
glimpse(resale_prices_sf)
Rows: 15,901
Columns: 21
$ month <chr> "2019-01", "2019-01", "2019-01", "2019-01~
$ town <chr> "ANG MO KIO", "ANG MO KIO", "ANG MO KIO",~
$ flt_typ <chr> "4 ROOM", "4 ROOM", "4 ROOM", "4 ROOM", "~
$ block <chr> "204", "175", "543", "118", "411", "546",~
$ strt_nm <chr> "ANG MO KIO AVE 3", "ANG MO KIO AVE 4", "~
$ stry_rn <chr> "01 TO 03", "07 TO 09", "01 TO 03", "04 T~
$ flr_r_s <dbl> 92, 91, 92, 99, 92, 92, 92, 92, 93, 91, 9~
$ flt_mdl <chr> "New Generation", "New Generation", "New ~
$ ls_cmm_ <dbl> 1977, 1981, 1981, 1978, 1979, 1981, 1978,~
$ rmnng_l <dbl> 57.00, 61.50, 61.08, 58.33, 59.58, 61.00,~
$ rsl_prc <dbl> 330000, 360000, 370000, 375000, 380000, 3~
$ PROX_CB <dbl> 8824.749, 9841.309, 9560.780, 9609.731, 8~
$ PROX_EL <dbl> 251.4065, 631.8448, 1082.4168, 347.3195, ~
$ PROX_MR <dbl> 703.9715, 403.4297, 889.9529, 200.9786, 8~
$ PROX_PA <dbl> 745.0859, 429.4870, 780.0777, 177.6163, 9~
$ PROX_HA <dbl> 441.82653, 269.72560, 258.29513, 436.4751~
$ PROX_SP <dbl> 270.8222, 310.1889, 318.7560, 458.6748, 3~
$ NUM_CC <dbl> 6, 5, 2, 3, 3, 2, 3, 4, 3, 2, 4, 4, 4, 5,~
$ NUM_KIN <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,~
$ NUM_BUS <dbl> 8, 8, 8, 7, 6, 9, 6, 6, 5, 4, 10, 5, 6, 9~
$ geometry <POINT [m]> POINT (29179.92 38822.08), POINT (2~
Here the truncated column names will be rename for a better understanding of the columns.
resale_prices_sf <- resale_prices_sf %>%
rename(floor_area_sqm = flr_r_s,
remaining_lease = rmnng_l,
resale_price = rsl_prc,
PROX_CBD = PROX_CB,
PROX_ELDER = PROX_EL,
PROX_MRTLRT = PROX_MR,
PROX_PARK = PROX_PA,
PROX_HAWKER = PROX_HA,
PROX_SPRMRT = PROX_SP)
EDA is done before any modeling or steps to get a better understanding of all the variables.
Summary Statistics for Resale Prices
summary(resale_prices_sf$resale_price)
Min. 1st Qu. Median Mean 3rd Qu. Max.
218000 353000 405000 433589 470000 1186888
Box plot for Resale Prices
ggplot(resale_prices_sf, aes(x = '', y = resale_price)) +
geom_boxplot() +
labs(x='', y='Resale Price') +
theme_minimal()

ggplot(resale_prices_sf, aes(x = resale_price)) +
geom_histogram(fill = 'darksalmon') +
labs(title = "Distribution of Resale Prices",
x = "Resale Prices",
y = 'Frequency') +
theme_minimal()

The independent variables consists of:
AREA_SQM <- ggplot(resale_prices_sf, aes(x = floor_area_sqm)) +
geom_histogram(bins=20, fill = 'lightblue') +
theme_minimal()
REMAIN_LEASE <- ggplot(resale_prices_sf, aes(x = remaining_lease)) +
geom_histogram(bins=20, fill = 'lightblue') +
theme_minimal()
PROX_CBD <- ggplot(resale_prices_sf, aes(x = PROX_CBD)) +
geom_histogram(bins=20, fill = 'lightblue') +
theme_minimal()
PROX_ELDER <- ggplot(resale_prices_sf, aes(x = PROX_ELDER)) +
geom_histogram(bins=20, fill = 'lightblue') +
theme_minimal()
PROX_MRTLRT <- ggplot(resale_prices_sf, aes(x = PROX_MRTLRT)) +
geom_histogram(bins=20, fill = 'lightblue') +
theme_minimal()
PROX_PARK <- ggplot(resale_prices_sf, aes(x = PROX_PARK)) +
geom_histogram(bins=20, fill = 'lightblue') +
theme_minimal()
PROX_HAWKER <- ggplot(resale_prices_sf, aes(x = PROX_HAWKER)) +
geom_histogram(bins=20, fill = 'lightblue') +
theme_minimal()
PROX_SPRMRT <- ggplot(resale_prices_sf, aes(x = PROX_SPRMRT)) +
geom_histogram(bins=20, fill = 'lightblue') +
theme_minimal()
NUM_CC <- ggplot(resale_prices_sf, aes(x = NUM_CC)) +
geom_histogram(bins=20, fill = 'lightblue') +
theme_minimal()
NUM_KIN <- ggplot(resale_prices_sf, aes(x = NUM_KIN)) +
geom_histogram(bins=20, fill = 'lightblue') +
theme_minimal()
NUM_BUS <- ggplot(resale_prices_sf, aes(x = NUM_BUS)) +
geom_histogram(bins=20, fill = 'lightblue') +
theme_minimal()
ggarrange(AREA_SQM, REMAIN_LEASE, PROX_CBD, PROX_ELDER, PROX_MRTLRT, PROX_PARK, PROX_HAWKER, PROX_SPRMRT, NUM_CC, NUM_KIN, NUM_BUS, ncol = 3, nrow = 4)

Create a new dataframe called towns_avg. This dataframe consists of the average Resale Price of each town.
After calculating the average, rescale the resale price by dividing it by 100000. This is to ensure that the values are more easily understandable when plotted.
top10_price = top_n(towns_avg, 10, resale_price)
ggplot(top10_price, aes(x=resale_price, y=reorder(town, resale_price), label=round(resale_price, 2))) +
geom_col(fill='darksalmon') +
labs(title='Top 10 Towns with the highest Average Resale Prices',
x='Resale Price ($100000)',
y='Town') +
geom_text(nudge_x=0.01, colour='gray23', size=3.5) +
theme_minimal()

This is to reveal the Geospatial distribution of the Resale Prices in Singapore.
tmap_mode("view")
tm_shape(mpsz)+
tm_polygons() +
tm_shape(resale_prices_sf) +
tm_dots(col = "resale_price",
alpha = 0.6,
style="quantile") +
tm_view(set.zoom.limits = c(11,14))